package com.stack.wemedia.test;

import com.google.gson.Gson;
import com.stack.wemedia.WemediaApplication;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;

import java.io.IOException;
import java.util.List;

/**
 * Small command-line scraper that fetches a web page with jsoup and logs what it extracts.
 *
 * <p>Two scrapers are available:
 * <ul>
 *   <li>default (no arguments): the cnblogs home page, logging every post link and title;</li>
 *   <li>first argument {@code zhibo8}: a single zhibo8 news article, logging its title,
 *       image URLs and paragraph text.</li>
 * </ul>
 *
 * <p>NOTE(review): the class carries {@code @SpringBootTest}/{@code @RunWith} but declares no
 * test methods and {@link #main(String[])} does not use the Spring context; confirm whether
 * these annotations are still needed.
 */
@SpringBootTest(classes = WemediaApplication.class)
@RunWith(SpringRunner.class)
@Slf4j
public class NewsScraper {

    /** Command-line argument that selects the zhibo8 article scraper. */
    private static final String ZHIBO8_MODE = "zhibo8";

    /** Home page of cnblogs (博客园). */
    private static final String CNBLOGS_URL = "https://www.cnblogs.com";

    /** User-Agent header sent to cnblogs. */
    private static final String CNBLOGS_USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36";

    /** A zhibo8 (直播吧) news article page. */
    private static final String ZHIBO8_ARTICLE_URL =
            "https://m.zhibo8.com/news/web/nba/2023-11-30/65680f9760b8dnative.htm";

    /** User-Agent header sent to zhibo8. */
    private static final String ZHIBO8_USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36";

    /**
     * Runs one of the scrapers and logs the result.
     *
     * @param args optional; when the first argument equals {@code zhibo8} (case-insensitive)
     *             the zhibo8 article is scraped, otherwise the cnblogs home page is scraped
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        boolean zhibo8Requested = args != null
                && args.length > 0
                && ZHIBO8_MODE.equalsIgnoreCase(args[0]);

        if (zhibo8Requested) {
            scrapeZhibo8Article();
        } else {
            scrapeCnblogsHome();
        }
    }

    /**
     * Fetches the cnblogs home page and logs the URL and title of every element
     * carrying the {@code post-item-title} class.
     *
     * @throws IOException if the page cannot be fetched
     */
    private static void scrapeCnblogsHome() throws IOException {
        Document document = fetch(CNBLOGS_URL, CNBLOGS_USER_AGENT);
        log.info("document: {}", document);

        Elements postLinks = document.getElementsByClass("post-item-title");
        log.info("文章url数量: {}", postLinks.size());

        for (Element postLink : postLinks) {
            // Resolve relative links against the page's base URI; absUrl returns "" when the
            // attribute is missing or cannot be resolved, in which case keep the raw value.
            String newsUrl = postLink.absUrl("href");
            if (newsUrl.isEmpty()) {
                newsUrl = postLink.attr("href");
            }
            String title = postLink.text();

            log.info("新闻的url: {}", newsUrl);
            log.info("标题: {}", title);
        }
    }

    /**
     * Fetches a single zhibo8 news article and logs its title, the {@code data} attribute of
     * every {@code img_wrap} element, and the text of all non-blank {@code <p>} elements.
     *
     * @throws IOException if the page cannot be fetched
     */
    private static void scrapeZhibo8Article() throws IOException {
        Document document = fetch(ZHIBO8_ARTICLE_URL, ZHIBO8_USER_AGENT);
        log.info("document: {}", document);

        String title = document.getElementsByClass("title").text();
        log.info("新闻的标题: {}", title);

        Elements imageWrappers = document.getElementsByClass("img_wrap");
        for (Element imageWrapper : imageWrappers) {
            String imgUrl = imageWrapper.attr("data");
            log.info("imgUrl: {}", imgUrl);
        }

        // Paragraph texts are appended one per line after a single leading space.
        StringBuilder text = new StringBuilder(" ");
        for (Element paragraph : document.getElementsByTag("p")) {
            String paragraphText = paragraph.text();
            if (StringUtils.isNotBlank(paragraphText)) {
                text.append("\n").append(paragraphText);
            }
        }
        log.info("新闻内容: {}", text);
    }

    /**
     * Downloads and parses the page at {@code url} with an HTTP GET request.
     *
     * @param url       address of the page to fetch
     * @param userAgent value sent in the {@code User-Agent} request header
     * @return the parsed document
     * @throws IOException if the request fails
     */
    private static Document fetch(String url, String userAgent) throws IOException {
        return Jsoup.connect(url)
                .header("User-Agent", userAgent)
                .get();
    }
}
